# -*- coding: UTF-8 -*-
import numpy as np
import random

from agents.agent import Agent


class DQN_FT(Agent):
    """DQN agent that fine-tunes a pre-trained Keras model on each training task."""

    def __init__(self, model_lambda, buffer, pretrained_model, *args, target_update_ev=1000, test_epsilon=0.03, **kwargs):
        """
        Creates a new DQN agent for fine tuning a pre-trained model
        
        Parameters
        ----------
        model_lambda : function
            returns a keras Model instance
        buffer : ReplayBuffer
            a replay buffer that implements randomized experience replay
        pretrained_model: keras Model
            a pre-trained model whose weights are loaded into the DQN agent
        target_update_ev : integer
            how often to update the target network (defaults to 1000)
        test_epsilon : float
            the exploration parameter for epsilon greedy used during testing 
            (defaults to 0.03 as in the paper)

        Any remaining positional and keyword arguments are forwarded to Agent.
        """
        super().__init__(*args, **kwargs)
        self.model_lambda = model_lambda
        self.pretrained_model = pretrained_model
        self.buffer = buffer
        self.target_update_ev = target_update_ev
        self.test_epsilon = test_epsilon

    def _build_networks(self, initial_weights=None, freeze_last_layer=False):
        """
        Rebuilds the online and target networks, clears the replay buffer and
        restarts the target-update counter.
        
        Parameters
        ----------
        initial_weights : list or None
            weights to load into the online network; when None the network keeps
            whatever weights model_lambda produced
        freeze_last_layer : boolean
            whether to mark the last layer of both networks as non-trainable
        """
        self.Q = self.model_lambda()
        if initial_weights is not None:
            self.Q.set_weights(initial_weights)
        if freeze_last_layer:
            # NOTE(review): if model_lambda returns an already-compiled model, Keras
            # may require compile() to be called again before this flag affects
            # training -- confirm against the model factory.
            self.Q.layers[-1].trainable = False

        # the target network always starts as an exact copy of the online network
        self.target_Q = self.model_lambda()
        self.target_Q.set_weights(self.Q.get_weights())
        if freeze_last_layer:
            # the target network is never trained directly, so this is probably not needed
            self.target_Q.layers[-1].trainable = False

        self.buffer.reset()
        self.updates_since_target_updated = 0

    def reset(self):
        """Resets the base agent and rebuilds fresh networks from model_lambda."""
        Agent.reset(self)
        self._build_networks()

    def init_pretrained_model(self):
        """
        Rebuilds both networks from the pre-trained weights with their last layer
        frozen, and clears the replay buffer and the target-update counter.
        """
        self._build_networks(
            initial_weights=self.pretrained_model.get_weights(),
            freeze_last_layer=True)

    def get_Q_values(self, s, s_enc):
        """
        Returns the online network's prediction for the encoded state(s) s_enc.
        The raw state s is accepted but not used.
        """
        return self.Q.predict_on_batch(s_enc)

    def train_agent(self, s, s_enc, a, r, s1, s1_enc, gamma):
        """
        Stores one transition in the replay buffer and performs one update of the
        online network on a replayed batch (if the buffer returns one).
        
        Only the encoded states are stored; the raw states s and s1 are not used.
        """

        # remember this experience
        self.buffer.append(s_enc, a, r, s1_enc, gamma)

        # sample experience at random; the buffer returns None when it has no batch to give
        batch = self.buffer.replay()
        if batch is None:
            return
        states, actions, rewards, next_states, gammas = batch
        indices = np.arange(self.buffer.n_batch)

        # flatten the per-sample quantities so they line up element-wise with
        # `indices` instead of broadcasting to a matrix if they arrive as columns
        actions = np.ravel(actions)
        rewards = np.ravel(rewards)
        gammas = np.ravel(gammas)

        # main update: the online network selects the next action and the target
        # network evaluates it; only the entries of the taken actions are changed
        next_actions = np.argmax(self.Q.predict_on_batch(next_states), axis=1)
        next_values = self.target_Q.predict_on_batch(next_states)[indices, next_actions]
        targets = self.Q.predict_on_batch(states)
        targets[indices, actions] = rewards + gammas * next_values
        self.Q.train_on_batch(states, targets)

        # target update
        self.updates_since_target_updated += 1
        if self.updates_since_target_updated >= self.target_update_ev:
            self.target_Q.set_weights(self.Q.get_weights())
            self.updates_since_target_updated = 0

    def train(self, train_tasks, n_samples, viewers=None, n_view_ev=None, test_tasks=None, n_test_ev=1000):
        """
        Fine-tunes the pre-trained model on each training task in turn and
        periodically evaluates it on the matching test task.
        
        Parameters
        ----------
        train_tasks : list
            the tasks to train on, one after the other
        n_samples : integer
            how many training samples to draw for each task
        viewers : list or None
            one viewer per training task, handed to next_sample
            (defaults to no viewers)
        n_view_ev : integer or None
            handed to next_sample together with the viewer
        test_tasks : list
            test_tasks[i] is evaluated while training on train_tasks[i]; it should
            be a different instance of that training task
        n_test_ev : integer
            evaluate every n_test_ev samples (defaults to 1000)
        
        Returns
        -------
        list : the test returns of all tasks, in the order they were measured
        
        Raises
        ------
        ValueError : if there are fewer test tasks than tasks to be trained
        """
        if viewers is None:
            viewers = [None] * len(train_tasks)
        if test_tasks is None:
            test_tasks = []

        # fail before any training happens rather than with an IndexError part-way through
        runs = list(zip(train_tasks, viewers))
        if len(test_tasks) < len(runs):
            raise ValueError(
                'expected one test task per training task, got {} test task(s) for {} training task(s)'.format(
                    len(test_tasks), len(runs)))

        # add tasks
        self.reset()
        for train_task in train_tasks:
            self.add_training_task(train_task)

        # train each one
        return_data = []
        for index, (train_task, viewer) in enumerate(runs):
            # reset to the pretrained model (last layer frozen)
            self.init_pretrained_model()
            self.set_active_training_task(index)
            # set test task, a different instance of the training task
            test_task = test_tasks[index]
            for t in range(n_samples):

                # train
                self.next_sample(viewer, n_view_ev)

                # test (this includes the very first sample, t = 0)
                if t % n_test_ev == 0:
                    R = self.test_agent(test_task)
                    return_data.append(R)
                    print('test performance: {}'.format(R))
        return return_data

    def get_test_action(self, s_enc):
        """
        Picks an action epsilon-greedily (with epsilon = test_epsilon) for the
        encoded state s_enc.
        """
        if random.random() <= self.test_epsilon:
            return random.randrange(self.n_actions)
        # only the encoding is available here; get_Q_values ignores its first argument
        q = self.get_Q_values(s_enc, s_enc)
        return np.argmax(q)

    def test_agent(self, task):
        """
        Runs one episode of at most self.T steps on the task using the test policy
        and returns the sum of the rewards received.
        """
        R = 0.
        s_enc = self.encoding(task.initialize())
        for _ in range(self.T):
            a = self.get_test_action(s_enc)
            s1, r, done = task.transition(a)
            s_enc = self.encoding(s1)
            R += r
            if done:
                break
        return R
